import pandas as pd
# dataset
#https://www.kaggle.com/sdolezel/black-friday
1.1 Leitura Seletiva¶
pd.set_option('display.max_columns', 12)
df = pd.read_csv('train.csv')
df.head(3)
| User_ID | Product_ID | Gender | Age | Occupation | City_Category | Stay_In_Current_City_Years | Marital_Status | Product_Category_1 | Product_Category_2 | Product_Category_3 | Purchase | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1000001 | P00069042 | F | 0-17 | 10 | A | 2 | 0 | 3 | NaN | NaN | 8370 |
| 1 | 1000001 | P00248942 | F | 0-17 | 10 | A | 2 | 0 | 1 | 6.0 | 14.0 | 15200 |
| 2 | 1000001 | P00087842 | F | 0-17 | 10 | A | 2 | 0 | 12 | NaN | NaN | 1422 |
df_part = pd.read_csv('train.csv', nrows=5000)
df_part.shape
(5000, 12)
df_cols = pd.read_csv('train.csv', usecols=['Age', 'City_Category', 'Occupation', 'Purchase'])
df_cols
| Age | Occupation | City_Category | Purchase | |
|---|---|---|---|---|
| 0 | 0-17 | 10 | A | 8370 |
| 1 | 0-17 | 10 | A | 15200 |
| 2 | 0-17 | 10 | A | 1422 |
| 3 | 0-17 | 10 | A | 1057 |
| 4 | 55+ | 16 | C | 7969 |
| ... | ... | ... | ... | ... |
| 550063 | 51-55 | 13 | B | 368 |
| 550064 | 26-35 | 1 | C | 371 |
| 550065 | 26-35 | 15 | B | 137 |
| 550066 | 55+ | 1 | C | 365 |
| 550067 | 46-50 | 0 | B | 490 |
550068 rows × 4 columns
1.2 Ler Vários Arquivos em um DataFrame Único - Por linha¶
df_1 = pd.read_csv('train.csv')
print(df_1.shape)
df_1.head()
(550068, 12)
| User_ID | Product_ID | Gender | Age | Occupation | City_Category | Stay_In_Current_City_Years | Marital_Status | Product_Category_1 | Product_Category_2 | Product_Category_3 | Purchase | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1000001 | P00069042 | F | 0-17 | 10 | A | 2 | 0 | 3 | NaN | NaN | 8370 |
| 1 | 1000001 | P00248942 | F | 0-17 | 10 | A | 2 | 0 | 1 | 6.0 | 14.0 | 15200 |
| 2 | 1000001 | P00087842 | F | 0-17 | 10 | A | 2 | 0 | 12 | NaN | NaN | 1422 |
| 3 | 1000001 | P00085442 | F | 0-17 | 10 | A | 2 | 0 | 12 | 14.0 | NaN | 1057 |
| 4 | 1000002 | P00285442 | M | 55+ | 16 | C | 4+ | 0 | 8 | NaN | NaN | 7969 |
df_2 = pd.read_csv('test.csv')
print(df_2.shape)
df_2.head()
(233599, 11)
| User_ID | Product_ID | Gender | Age | Occupation | City_Category | Stay_In_Current_City_Years | Marital_Status | Product_Category_1 | Product_Category_2 | Product_Category_3 | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1000004 | P00128942 | M | 46-50 | 7 | B | 2 | 1 | 1 | 11.0 | NaN |
| 1 | 1000009 | P00113442 | M | 26-35 | 17 | C | 0 | 0 | 3 | 5.0 | NaN |
| 2 | 1000010 | P00288442 | F | 36-45 | 1 | B | 4+ | 1 | 5 | 14.0 | NaN |
| 3 | 1000010 | P00145342 | F | 36-45 | 1 | B | 4+ | 1 | 4 | 9.0 | NaN |
| 4 | 1000011 | P00053842 | F | 26-35 | 1 | C | 1 | 0 | 4 | 5.0 | 12.0 |
from glob import glob
files = sorted(glob('*.csv'))
files
['test.csv', 'train.csv']
pd.concat((pd.read_csv(file) for file in files) , ignore_index = True)
| User_ID | Product_ID | Gender | Age | Occupation | City_Category | Stay_In_Current_City_Years | Marital_Status | Product_Category_1 | Product_Category_2 | Product_Category_3 | Purchase | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1000004 | P00128942 | M | 46-50 | 7 | B | 2 | 1 | 1 | 11.0 | NaN | NaN |
| 1 | 1000009 | P00113442 | M | 26-35 | 17 | C | 0 | 0 | 3 | 5.0 | NaN | NaN |
| 2 | 1000010 | P00288442 | F | 36-45 | 1 | B | 4+ | 1 | 5 | 14.0 | NaN | NaN |
| 3 | 1000010 | P00145342 | F | 36-45 | 1 | B | 4+ | 1 | 4 | 9.0 | NaN | NaN |
| 4 | 1000011 | P00053842 | F | 26-35 | 1 | C | 1 | 0 | 4 | 5.0 | 12.0 | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 783662 | 1006033 | P00372445 | M | 51-55 | 13 | B | 1 | 1 | 20 | NaN | NaN | 368.0 |
| 783663 | 1006035 | P00375436 | F | 26-35 | 1 | C | 3 | 0 | 20 | NaN | NaN | 371.0 |
| 783664 | 1006036 | P00375436 | F | 26-35 | 15 | B | 4+ | 1 | 20 | NaN | NaN | 137.0 |
| 783665 | 1006038 | P00375436 | F | 55+ | 1 | C | 2 | 0 | 20 | NaN | NaN | 365.0 |
| 783666 | 1006039 | P00371644 | F | 46-50 | 0 | B | 4+ | 1 | 20 | NaN | NaN | 490.0 |
783667 rows × 12 columns
1.2 Ler Vários Arquivos em um DataFrame Único - Por coluna¶
from glob import glob
df_1 = pd.read_csv('train.csv')
df_2 = pd.read_csv('test.csv')
files = sorted(glob('*.csv'))
files
['test.csv', 'train.csv']
pd.concat((pd.read_csv(file) for file in files) , axis = 'columns').head()
| User_ID | Product_ID | Gender | Age | Occupation | City_Category | ... | Stay_In_Current_City_Years | Marital_Status | Product_Category_1 | Product_Category_2 | Product_Category_3 | Purchase | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1000004.0 | P00128942 | M | 46-50 | 7.0 | B | ... | 2 | 0 | 3 | NaN | NaN | 8370 |
| 1 | 1000009.0 | P00113442 | M | 26-35 | 17.0 | C | ... | 2 | 0 | 1 | 6.0 | 14.0 | 15200 |
| 2 | 1000010.0 | P00288442 | F | 36-45 | 1.0 | B | ... | 2 | 0 | 12 | NaN | NaN | 1422 |
| 3 | 1000010.0 | P00145342 | F | 36-45 | 1.0 | B | ... | 2 | 0 | 12 | 14.0 | NaN | 1057 |
| 4 | 1000011.0 | P00053842 | F | 26-35 | 1.0 | C | ... | 4+ | 0 | 8 | NaN | NaN | 7969 |
5 rows × 23 columns
df = pd.read_csv('train.csv')
df.head(2)
| User_ID | Product_ID | Gender | Age | Occupation | City_Category | Stay_In_Current_City_Years | Marital_Status | Product_Category_1 | Product_Category_2 | Product_Category_3 | Purchase | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1000001 | P00069042 | F | 0-17 | 10 | A | 2 | 0 | 3 | NaN | NaN | 8370 |
| 1 | 1000001 | P00248942 | F | 0-17 | 10 | A | 2 | 0 | 1 | 6.0 | 14.0 | 15200 |
new_col_names = {'Age': 'Idade','City_Category': 'Cidade' }
df = df.rename(new_col_names, axis = 'columns')
df.head()
| User_ID | Product_ID | Gender | Idade | Occupation | Cidade | Stay_In_Current_City_Years | Marital_Status | Product_Category_1 | Product_Category_2 | Product_Category_3 | Purchase | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1000001 | P00069042 | F | 0-17 | 10 | A | 2 | 0 | 3 | NaN | NaN | 8370 |
| 1 | 1000001 | P00248942 | F | 0-17 | 10 | A | 2 | 0 | 1 | 6.0 | 14.0 | 15200 |
| 2 | 1000001 | P00087842 | F | 0-17 | 10 | A | 2 | 0 | 12 | NaN | NaN | 1422 |
| 3 | 1000001 | P00085442 | F | 0-17 | 10 | A | 2 | 0 | 12 | 14.0 | NaN | 1057 |
| 4 | 1000002 | P00285442 | M | 55+ | 16 | C | 4+ | 0 | 8 | NaN | NaN | 7969 |
df.columns = df.columns.str.replace('_', '-')
df.head()
| User-ID | Product-ID | Gender | Idade | Occupation | Cidade | Stay-In-Current-City-Years | Marital-Status | Product-Category-1 | Product-Category-2 | Product-Category-3 | Purchase | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1000001 | P00069042 | F | 0-17 | 10 | A | 2 | 0 | 3 | NaN | NaN | 8370 |
| 1 | 1000001 | P00248942 | F | 0-17 | 10 | A | 2 | 0 | 1 | 6.0 | 14.0 | 15200 |
| 2 | 1000001 | P00087842 | F | 0-17 | 10 | A | 2 | 0 | 12 | NaN | NaN | 1422 |
| 3 | 1000001 | P00085442 | F | 0-17 | 10 | A | 2 | 0 | 12 | 14.0 | NaN | 1057 |
| 4 | 1000002 | P00285442 | M | 55+ | 16 | C | 4+ | 0 | 8 | NaN | NaN | 7969 |
Prefixo e Sufixo¶
df.add_prefix('A_')
| A_User-ID | A_Product-ID | A_Gender | A_Idade | A_Occupation | A_Cidade | A_Stay-In-Current-City-Years | A_Marital-Status | A_Product-Category-1 | A_Product-Category-2 | A_Product-Category-3 | A_Purchase | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1000001 | P00069042 | F | 0-17 | 10 | A | 2 | 0 | 3 | NaN | NaN | 8370 |
| 1 | 1000001 | P00248942 | F | 0-17 | 10 | A | 2 | 0 | 1 | 6.0 | 14.0 | 15200 |
| 2 | 1000001 | P00087842 | F | 0-17 | 10 | A | 2 | 0 | 12 | NaN | NaN | 1422 |
| 3 | 1000001 | P00085442 | F | 0-17 | 10 | A | 2 | 0 | 12 | 14.0 | NaN | 1057 |
| 4 | 1000002 | P00285442 | M | 55+ | 16 | C | 4+ | 0 | 8 | NaN | NaN | 7969 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 550063 | 1006033 | P00372445 | M | 51-55 | 13 | B | 1 | 1 | 20 | NaN | NaN | 368 |
| 550064 | 1006035 | P00375436 | F | 26-35 | 1 | C | 3 | 0 | 20 | NaN | NaN | 371 |
| 550065 | 1006036 | P00375436 | F | 26-35 | 15 | B | 4+ | 1 | 20 | NaN | NaN | 137 |
| 550066 | 1006038 | P00375436 | F | 55+ | 1 | C | 2 | 0 | 20 | NaN | NaN | 365 |
| 550067 | 1006039 | P00371644 | F | 46-50 | 0 | B | 4+ | 1 | 20 | NaN | NaN | 490 |
550068 rows × 12 columns
df.add_suffix('_B')
| User-ID_B | Product-ID_B | Gender_B | Idade_B | Occupation_B | Cidade_B | Stay-In-Current-City-Years_B | Marital-Status_B | Product-Category-1_B | Product-Category-2_B | Product-Category-3_B | Purchase_B | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1000001 | P00069042 | F | 0-17 | 10 | A | 2 | 0 | 3 | NaN | NaN | 8370 |
| 1 | 1000001 | P00248942 | F | 0-17 | 10 | A | 2 | 0 | 1 | 6.0 | 14.0 | 15200 |
| 2 | 1000001 | P00087842 | F | 0-17 | 10 | A | 2 | 0 | 12 | NaN | NaN | 1422 |
| 3 | 1000001 | P00085442 | F | 0-17 | 10 | A | 2 | 0 | 12 | 14.0 | NaN | 1057 |
| 4 | 1000002 | P00285442 | M | 55+ | 16 | C | 4+ | 0 | 8 | NaN | NaN | 7969 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 550063 | 1006033 | P00372445 | M | 51-55 | 13 | B | 1 | 1 | 20 | NaN | NaN | 368 |
| 550064 | 1006035 | P00375436 | F | 26-35 | 1 | C | 3 | 0 | 20 | NaN | NaN | 371 |
| 550065 | 1006036 | P00375436 | F | 26-35 | 15 | B | 4+ | 1 | 20 | NaN | NaN | 137 |
| 550066 | 1006038 | P00375436 | F | 55+ | 1 | C | 2 | 0 | 20 | NaN | NaN | 365 |
| 550067 | 1006039 | P00371644 | F | 46-50 | 0 | B | 4+ | 1 | 20 | NaN | NaN | 490 |
550068 rows × 12 columns
df.isna().sum()
User-ID 0 Product-ID 0 Gender 0 Idade 0 Occupation 0 Cidade 0 Stay-In-Current-City-Years 0 Marital-Status 0 Product-Category-1 0 Product-Category-2 173638 Product-Category-3 383247 Purchase 0 dtype: int64
df.isna().mean()
User-ID 0.000000 Product-ID 0.000000 Gender 0.000000 Idade 0.000000 Occupation 0.000000 Cidade 0.000000 Stay-In-Current-City-Years 0.000000 Marital-Status 0.000000 Product-Category-1 0.000000 Product-Category-2 0.315666 Product-Category-3 0.696727 Purchase 0.000000 dtype: float64
df = pd.read_csv('train.csv')
df.Product_Category_3.value_counts()
16.0 32636 15.0 28013 14.0 18428 17.0 16702 5.0 16658 8.0 12562 9.0 11579 12.0 9246 13.0 5459 6.0 4890 18.0 4629 4.0 1875 11.0 1805 10.0 1726 3.0 613 Name: Product_Category_3, dtype: int64
Moda, Média, Mediana¶
# Moda
moda = df['Product_Category_3'].mode()[0]
media = df['Product_Category_3'].mean()
mediana = df['Product_Category_3'].median()
print('Moda:', moda)
print('Média:', media)
print('Mediana:', mediana)
df['Product_Category_3'].fillna(value=media, inplace=True)
df.Product_Category_3.value_counts()
Moda: 16.0 Média: 12.668243206790512 Mediana: 14.0
12.668243 383247 16.000000 32636 15.000000 28013 14.000000 18428 17.000000 16702 5.000000 16658 8.000000 12562 9.000000 11579 12.000000 9246 13.000000 5459 6.000000 4890 18.000000 4629 4.000000 1875 11.000000 1805 10.000000 1726 3.000000 613 Name: Product_Category_3, dtype: int64
Pad, Backfill¶
df = pd.read_csv('train.csv')
df.head(10)
| User_ID | Product_ID | Gender | Age | Occupation | City_Category | Stay_In_Current_City_Years | Marital_Status | Product_Category_1 | Product_Category_2 | Product_Category_3 | Purchase | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1000001 | P00069042 | F | 0-17 | 10 | A | 2 | 0 | 3 | NaN | NaN | 8370 |
| 1 | 1000001 | P00248942 | F | 0-17 | 10 | A | 2 | 0 | 1 | 6.0 | 14.0 | 15200 |
| 2 | 1000001 | P00087842 | F | 0-17 | 10 | A | 2 | 0 | 12 | NaN | NaN | 1422 |
| 3 | 1000001 | P00085442 | F | 0-17 | 10 | A | 2 | 0 | 12 | 14.0 | NaN | 1057 |
| 4 | 1000002 | P00285442 | M | 55+ | 16 | C | 4+ | 0 | 8 | NaN | NaN | 7969 |
| 5 | 1000003 | P00193542 | M | 26-35 | 15 | A | 3 | 0 | 1 | 2.0 | NaN | 15227 |
| 6 | 1000004 | P00184942 | M | 46-50 | 7 | B | 2 | 1 | 1 | 8.0 | 17.0 | 19215 |
| 7 | 1000004 | P00346142 | M | 46-50 | 7 | B | 2 | 1 | 1 | 15.0 | NaN | 15854 |
| 8 | 1000004 | P0097242 | M | 46-50 | 7 | B | 2 | 1 | 1 | 16.0 | NaN | 15686 |
| 9 | 1000005 | P00274942 | M | 26-35 | 20 | A | 1 | 1 | 8 | NaN | NaN | 7871 |
df = pd.read_csv('train.csv')
df['Product_Category_3'].fillna(method = "pad", inplace=True)
df.Product_Category_3.value_counts()
df.head(10)
| User_ID | Product_ID | Gender | Age | Occupation | City_Category | Stay_In_Current_City_Years | Marital_Status | Product_Category_1 | Product_Category_2 | Product_Category_3 | Purchase | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1000001 | P00069042 | F | 0-17 | 10 | A | 2 | 0 | 3 | NaN | NaN | 8370 |
| 1 | 1000001 | P00248942 | F | 0-17 | 10 | A | 2 | 0 | 1 | 6.0 | 14.0 | 15200 |
| 2 | 1000001 | P00087842 | F | 0-17 | 10 | A | 2 | 0 | 12 | NaN | 14.0 | 1422 |
| 3 | 1000001 | P00085442 | F | 0-17 | 10 | A | 2 | 0 | 12 | 14.0 | 14.0 | 1057 |
| 4 | 1000002 | P00285442 | M | 55+ | 16 | C | 4+ | 0 | 8 | NaN | 14.0 | 7969 |
| 5 | 1000003 | P00193542 | M | 26-35 | 15 | A | 3 | 0 | 1 | 2.0 | 14.0 | 15227 |
| 6 | 1000004 | P00184942 | M | 46-50 | 7 | B | 2 | 1 | 1 | 8.0 | 17.0 | 19215 |
| 7 | 1000004 | P00346142 | M | 46-50 | 7 | B | 2 | 1 | 1 | 15.0 | 17.0 | 15854 |
| 8 | 1000004 | P0097242 | M | 46-50 | 7 | B | 2 | 1 | 1 | 16.0 | 17.0 | 15686 |
| 9 | 1000005 | P00274942 | M | 26-35 | 20 | A | 1 | 1 | 8 | NaN | 17.0 | 7871 |
df = pd.read_csv('train.csv')
df['Product_Category_2'].fillna(method = "backfill", inplace=True)
df.Product_Category_2.value_counts()
df.head(10)
| User_ID | Product_ID | Gender | Age | Occupation | City_Category | Stay_In_Current_City_Years | Marital_Status | Product_Category_1 | Product_Category_2 | Product_Category_3 | Purchase | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1000001 | P00069042 | F | 0-17 | 10 | A | 2 | 0 | 3 | 6.0 | NaN | 8370 |
| 1 | 1000001 | P00248942 | F | 0-17 | 10 | A | 2 | 0 | 1 | 6.0 | 14.0 | 15200 |
| 2 | 1000001 | P00087842 | F | 0-17 | 10 | A | 2 | 0 | 12 | 14.0 | NaN | 1422 |
| 3 | 1000001 | P00085442 | F | 0-17 | 10 | A | 2 | 0 | 12 | 14.0 | NaN | 1057 |
| 4 | 1000002 | P00285442 | M | 55+ | 16 | C | 4+ | 0 | 8 | 2.0 | NaN | 7969 |
| 5 | 1000003 | P00193542 | M | 26-35 | 15 | A | 3 | 0 | 1 | 2.0 | NaN | 15227 |
| 6 | 1000004 | P00184942 | M | 46-50 | 7 | B | 2 | 1 | 1 | 8.0 | 17.0 | 19215 |
| 7 | 1000004 | P00346142 | M | 46-50 | 7 | B | 2 | 1 | 1 | 15.0 | NaN | 15854 |
| 8 | 1000004 | P0097242 | M | 46-50 | 7 | B | 2 | 1 | 1 | 16.0 | NaN | 15686 |
| 9 | 1000005 | P00274942 | M | 26-35 | 20 | A | 1 | 1 | 8 | 11.0 | NaN | 7871 |
df = pd.read_csv('train.csv')
df.isna().sum()
User_ID 0 Product_ID 0 Gender 0 Age 0 Occupation 0 City_Category 0 Stay_In_Current_City_Years 0 Marital_Status 0 Product_Category_1 0 Product_Category_2 173638 Product_Category_3 383247 Purchase 0 dtype: int64
# axis = 0 deleta linhas com missing values
# axis = 1 deleta colunas com missing values
df = pd.read_csv('train.csv')
df.dropna(axis=0, thresh= 11, inplace=True)
df.isna().sum()
User_ID 0 Product_ID 0 Gender 0 Age 0 Occupation 0 City_Category 0 Stay_In_Current_City_Years 0 Marital_Status 0 Product_Category_1 0 Product_Category_2 0 Product_Category_3 209609 Purchase 0 dtype: int64
Operadores¶
# Pode usar também:
# == para igual
# >= para maior igual
# <= para menor ou igual
# & para and
# | para ou
df = pd.read_csv('train.csv')
A_0_17 = df[(df.Age == '0-17') & (df.City_Category == 'A')]
A_0_17
| User_ID | Product_ID | Gender | Age | Occupation | City_Category | Stay_In_Current_City_Years | Marital_Status | Product_Category_1 | Product_Category_2 | Product_Category_3 | Purchase | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1000001 | P00069042 | F | 0-17 | 10 | A | 2 | 0 | 3 | NaN | NaN | 8370 |
| 1 | 1000001 | P00248942 | F | 0-17 | 10 | A | 2 | 0 | 1 | 6.0 | 14.0 | 15200 |
| 2 | 1000001 | P00087842 | F | 0-17 | 10 | A | 2 | 0 | 12 | NaN | NaN | 1422 |
| 3 | 1000001 | P00085442 | F | 0-17 | 10 | A | 2 | 0 | 12 | 14.0 | NaN | 1057 |
| 85 | 1000019 | P00112542 | M | 0-17 | 10 | A | 3 | 0 | 1 | 11.0 | 15.0 | 7746 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 548970 | 1004478 | P00372445 | M | 0-17 | 12 | A | 1 | 0 | 20 | NaN | NaN | 246 |
| 548982 | 1004493 | P00372445 | F | 0-17 | 10 | A | 1 | 0 | 20 | NaN | NaN | 374 |
| 549222 | 1004807 | P00372445 | M | 0-17 | 2 | A | 1 | 0 | 20 | NaN | NaN | 359 |
| 549524 | 1005255 | P00372445 | M | 0-17 | 10 | A | 3 | 0 | 20 | NaN | NaN | 364 |
| 549562 | 1005302 | P00372445 | F | 0-17 | 10 | A | 1 | 0 | 20 | NaN | NaN | 139 |
2544 rows × 12 columns
4.1 Query¶
df = pd.read_csv('train.csv')
df = df.query('500 < Purchase < 800')
df
| User_ID | Product_ID | Gender | Age | Occupation | City_Category | Stay_In_Current_City_Years | Marital_Status | Product_Category_1 | Product_Category_2 | Product_Category_3 | Purchase | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 80 | 1000018 | P0094142 | F | 18-25 | 3 | B | 3 | 0 | 4 | 5.0 | NaN | 697 |
| 122 | 1000023 | P00112342 | M | 36-45 | 0 | B | 3 | 1 | 13 | 16.0 | NaN | 584 |
| 147 | 1000028 | P00084442 | F | 26-35 | 1 | C | 2 | 1 | 13 | 16.0 | NaN | 758 |
| 371 | 1000060 | P00132042 | M | 51-55 | 1 | C | 1 | 1 | 13 | 16.0 | NaN | 742 |
| 744 | 1000140 | P00084642 | F | 36-45 | 1 | B | 3 | 0 | 13 | 16.0 | NaN | 770 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 550023 | 1005972 | P00371644 | F | 26-35 | 20 | B | 0 | 0 | 20 | NaN | NaN | 598 |
| 550038 | 1005995 | P00375436 | F | 36-45 | 1 | C | 2 | 1 | 20 | NaN | NaN | 596 |
| 550039 | 1005996 | P00371644 | F | 26-35 | 0 | B | 1 | 1 | 20 | NaN | NaN | 595 |
| 550048 | 1006010 | P00371644 | M | 36-45 | 0 | C | 1 | 0 | 20 | NaN | NaN | 591 |
| 550061 | 1006029 | P00372445 | F | 26-35 | 1 | C | 1 | 1 | 20 | NaN | NaN | 599 |
5297 rows × 12 columns
4.2 isin¶
df = pd.read_csv('train.csv')
df = df[df['Stay_In_Current_City_Years'].isin(['4+', '3'])]
df
| User_ID | Product_ID | Gender | Age | Occupation | City_Category | Stay_In_Current_City_Years | Marital_Status | Product_Category_1 | Product_Category_2 | Product_Category_3 | Purchase | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4 | 1000002 | P00285442 | M | 55+ | 16 | C | 4+ | 0 | 8 | NaN | NaN | 7969 |
| 5 | 1000003 | P00193542 | M | 26-35 | 15 | A | 3 | 0 | 1 | 2.0 | NaN | 15227 |
| 19 | 1000008 | P00249542 | M | 26-35 | 12 | C | 4+ | 1 | 1 | 5.0 | 15.0 | 19614 |
| 20 | 1000008 | P00220442 | M | 26-35 | 12 | C | 4+ | 1 | 5 | 14.0 | NaN | 8584 |
| 21 | 1000008 | P00156442 | M | 26-35 | 12 | C | 4+ | 1 | 8 | NaN | NaN | 9872 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 550056 | 1006022 | P00375436 | M | 26-35 | 17 | C | 4+ | 0 | 20 | NaN | NaN | 254 |
| 550062 | 1006032 | P00372445 | M | 46-50 | 7 | A | 3 | 0 | 20 | NaN | NaN | 473 |
| 550064 | 1006035 | P00375436 | F | 26-35 | 1 | C | 3 | 0 | 20 | NaN | NaN | 371 |
| 550065 | 1006036 | P00375436 | F | 26-35 | 15 | B | 4+ | 1 | 20 | NaN | NaN | 137 |
| 550067 | 1006039 | P00371644 | F | 46-50 | 0 | B | 4+ | 1 | 20 | NaN | NaN | 490 |
180011 rows × 12 columns
4.3 ~ ou not¶
df = pd.read_csv('train.csv')
df = df[~df['Stay_In_Current_City_Years'].isin(['4+', '3'])]
df
| User_ID | Product_ID | Gender | Age | Occupation | City_Category | Stay_In_Current_City_Years | Marital_Status | Product_Category_1 | Product_Category_2 | Product_Category_3 | Purchase | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1000001 | P00069042 | F | 0-17 | 10 | A | 2 | 0 | 3 | NaN | NaN | 8370 |
| 1 | 1000001 | P00248942 | F | 0-17 | 10 | A | 2 | 0 | 1 | 6.0 | 14.0 | 15200 |
| 2 | 1000001 | P00087842 | F | 0-17 | 10 | A | 2 | 0 | 12 | NaN | NaN | 1422 |
| 3 | 1000001 | P00085442 | F | 0-17 | 10 | A | 2 | 0 | 12 | 14.0 | NaN | 1057 |
| 6 | 1000004 | P00184942 | M | 46-50 | 7 | B | 2 | 1 | 1 | 8.0 | 17.0 | 19215 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 550059 | 1006025 | P00370853 | F | 26-35 | 1 | B | 1 | 0 | 19 | NaN | NaN | 48 |
| 550060 | 1006026 | P00371644 | M | 36-45 | 6 | C | 1 | 1 | 20 | NaN | NaN | 494 |
| 550061 | 1006029 | P00372445 | F | 26-35 | 1 | C | 1 | 1 | 20 | NaN | NaN | 599 |
| 550063 | 1006033 | P00372445 | M | 51-55 | 13 | B | 1 | 1 | 20 | NaN | NaN | 368 |
| 550066 | 1006038 | P00375436 | F | 55+ | 1 | C | 2 | 0 | 20 | NaN | NaN | 365 |
370057 rows × 12 columns
5.1 Varias funções de agregação com Groupby¶
df[['Age','Gender','Purchase']].groupby(['Age','Gender']).agg(['mean','count', 'sum', 'median']).head()
| Purchase | |||||
|---|---|---|---|---|---|
| mean | count | sum | median | ||
| Age | Gender | ||||
| 0-17 | F | 8286.694085 | 3635 | 30122133 | 7800 |
| M | 9244.590496 | 7050 | 65174363 | 8113 | |
| 18-25 | F | 8345.538682 | 17463 | 145738142 | 7647 |
| M | 9445.586493 | 50686 | 478758997 | 8124 | |
| 26-35 | F | 8791.494875 | 32879 | 289055560 | 7909 |
5.1 Varias funções de agregação em diferentes grupos¶
df_dif_group = df[['Product_Category_1','Occupation','Purchase']].groupby('Occupation').agg({'Purchase':'mean', 'Product_Category_1':'sum'})
df_dif_group.head()
| Purchase | Product_Category_1 | |
|---|---|---|
| Occupation | ||
| 0 | 9173.296450 | 257553 |
| 1 | 8922.901463 | 181705 |
| 2 | 8773.293632 | 105368 |
| 3 | 9260.436527 | 65360 |
| 4 | 9216.389526 | 258181 |
df.dtypes
User_ID int64 Product_ID object Gender object Age object Occupation int64 City_Category object Stay_In_Current_City_Years object Marital_Status int64 Product_Category_1 int64 Product_Category_2 float64 Product_Category_3 float64 Purchase int64 dtype: object
df.select_dtypes(include='number').head()
| User_ID | Occupation | Marital_Status | Product_Category_1 | Product_Category_2 | Product_Category_3 | Purchase | |
|---|---|---|---|---|---|---|---|
| 0 | 1000001 | 10 | 0 | 3 | NaN | NaN | 8370 |
| 1 | 1000001 | 10 | 0 | 1 | 6.0 | 14.0 | 15200 |
| 2 | 1000001 | 10 | 0 | 12 | NaN | NaN | 1422 |
| 3 | 1000001 | 10 | 0 | 12 | 14.0 | NaN | 1057 |
| 6 | 1000004 | 7 | 1 | 1 | 8.0 | 17.0 | 19215 |
df.select_dtypes(include='object').head()
| Product_ID | Gender | Age | City_Category | Stay_In_Current_City_Years | |
|---|---|---|---|---|---|
| 0 | P00069042 | F | 0-17 | A | 2 |
| 1 | P00248942 | F | 0-17 | A | 2 |
| 2 | P00087842 | F | 0-17 | A | 2 |
| 3 | P00085442 | F | 0-17 | A | 2 |
| 6 | P00184942 | M | 46-50 | B | 2 |
df.select_dtypes(include=['number', 'object']).head()
| User_ID | Product_ID | Gender | Age | Occupation | City_Category | Stay_In_Current_City_Years | Marital_Status | Product_Category_1 | Product_Category_2 | Product_Category_3 | Purchase | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1000001 | P00069042 | F | 0-17 | 10 | A | 2 | 0 | 3 | NaN | NaN | 8370 |
| 1 | 1000001 | P00248942 | F | 0-17 | 10 | A | 2 | 0 | 1 | 6.0 | 14.0 | 15200 |
| 2 | 1000001 | P00087842 | F | 0-17 | 10 | A | 2 | 0 | 12 | NaN | NaN | 1422 |
| 3 | 1000001 | P00085442 | F | 0-17 | 10 | A | 2 | 0 | 12 | 14.0 | NaN | 1057 |
| 6 | 1000004 | P00184942 | M | 46-50 | 7 | B | 2 | 1 | 1 | 8.0 | 17.0 | 19215 |
df.select_dtypes(exclude='number').head()
| Product_ID | Gender | Age | City_Category | Stay_In_Current_City_Years | |
|---|---|---|---|---|---|
| 0 | P00069042 | F | 0-17 | A | 2 |
| 1 | P00248942 | F | 0-17 | A | 2 |
| 2 | P00087842 | F | 0-17 | A | 2 |
| 3 | P00085442 | F | 0-17 | A | 2 |
| 6 | P00184942 | M | 46-50 | B | 2 |
df = pd.read_csv('train.csv')
df['Stay_In_Current_City_Years'].unique()
array(['2', '4+', '3', '1', '0'], dtype=object)
df.dtypes
User_ID int64 Product_ID object Gender object Age object Occupation int64 City_Category object Stay_In_Current_City_Years object Marital_Status int64 Product_Category_1 int64 Product_Category_2 float64 Product_Category_3 float64 Purchase int64 dtype: object
#df.astype({'Stay-In-Current-City-Years': 'int'})
# converte os objects ou inputs inválidos na mudança para numerico em NaN
pd.to_numeric(df['Stay_In_Current_City_Years'], errors = 'coerce')
0 2.0
1 2.0
2 2.0
3 2.0
4 NaN
...
550063 1.0
550064 3.0
550065 NaN
550066 2.0
550067 NaN
Name: Stay_In_Current_City_Years, Length: 550068, dtype: float64
pd.to_numeric(df['Stay_In_Current_City_Years'], errors = 'coerce').fillna(4)
0 2.0
1 2.0
2 2.0
3 2.0
4 4.0
...
550063 1.0
550064 3.0
550065 4.0
550066 2.0
550067 4.0
Name: Stay_In_Current_City_Years, Length: 550068, dtype: float64
df['Stay_In_Current_City_Years']= pd.to_numeric(df['Stay_In_Current_City_Years'], errors = 'coerce').fillna(4)
df['Stay_In_Current_City_Years'].unique()
array([2., 4., 3., 1., 0.])
# tamanho do dataframe
len(df)
550068
df_1 = df.sample(frac = 0.8 , random_state = 10)
df_2 = df.drop(df_1.index)
len(df_1 ) + len(df_2)
550068
df_1.index.sort_values()
Int64Index([ 0, 1, 3, 4, 5, 7, 8, 9,
11, 12,
...
550055, 550056, 550058, 550059, 550060, 550061, 550063, 550065,
550066, 550067],
dtype='int64', length=440054)
df_2.index.sort_values()
Int64Index([ 2, 6, 10, 13, 15, 16, 18, 19,
21, 22,
...
550020, 550032, 550033, 550042, 550044, 550049, 550053, 550057,
550062, 550064],
dtype='int64', length=110014)
df = pd.read_csv('train.csv')
df.groupby(['Age','Gender']).Purchase.mean()
Age Gender
0-17 F 8338.771985
M 9235.173670
18-25 F 8343.180201
M 9440.942971
26-35 F 8728.251754
M 9410.337578
36-45 F 8959.844056
M 9453.193643
46-50 F 8842.098947
M 9357.471509
51-55 F 9042.449666
M 9705.094802
55+ F 9007.036199
M 9438.195603
Name: Purchase, dtype: float64
df.groupby(['Age','Gender']).Purchase.mean().unstack()
| Gender | F | M |
|---|---|---|
| Age | ||
| 0-17 | 8338.771985 | 9235.173670 |
| 18-25 | 8343.180201 | 9440.942971 |
| 26-35 | 8728.251754 | 9410.337578 |
| 36-45 | 8959.844056 | 9453.193643 |
| 46-50 | 8842.098947 | 9357.471509 |
| 51-55 | 9042.449666 | 9705.094802 |
| 55+ | 9007.036199 | 9438.195603 |
df.pivot_table(index = 'Age', columns = 'Gender', values = 'Purchase', aggfunc = 'mean')
| Gender | F | M |
|---|---|---|
| Age | ||
| 0-17 | 8338.771985 | 9235.173670 |
| 18-25 | 8343.180201 | 9440.942971 |
| 26-35 | 8728.251754 | 9410.337578 |
| 36-45 | 8959.844056 | 9453.193643 |
| 46-50 | 8842.098947 | 9357.471509 |
| 51-55 | 9042.449666 | 9705.094802 |
| 55+ | 9007.036199 | 9438.195603 |
df.pivot_table(index = 'Age', columns = 'Gender', values = 'Purchase', aggfunc = 'mean', margins = True)
| Gender | F | M | All |
|---|---|---|---|
| Age | |||
| 0-17 | 8338.771985 | 9235.173670 | 8933.464640 |
| 18-25 | 8343.180201 | 9440.942971 | 9169.663606 |
| 26-35 | 8728.251754 | 9410.337578 | 9252.690633 |
| 36-45 | 8959.844056 | 9453.193643 | 9331.350695 |
| 46-50 | 8842.098947 | 9357.471509 | 9208.625697 |
| 51-55 | 9042.449666 | 9705.094802 | 9534.808031 |
| 55+ | 9007.036199 | 9438.195603 | 9336.280459 |
| All | 8734.565765 | 9437.526040 | 9263.968713 |
df.Purchase.mean()
9263.968712959126
df[df.Age == '0-17']['Purchase'].mean()
8933.464640444974
df.Age.value_counts()
26-35 219587 36-45 110013 18-25 99660 46-50 45701 51-55 38501 55+ 21504 0-17 15102 Name: Age, dtype: int64
df.Age.value_counts(normalize = True)
26-35 0.399200 36-45 0.199999 18-25 0.181178 46-50 0.083082 51-55 0.069993 55+ 0.039093 0-17 0.027455 Name: Age, dtype: float64
9.1 Por index¶
df.iloc[:5, :6]
| User_ID | Product_ID | Gender | Age | Occupation | City_Category | |
|---|---|---|---|---|---|---|
| 0 | 1000001 | P00069042 | F | 0-17 | 10 | A |
| 1 | 1000001 | P00248942 | F | 0-17 | 10 | A |
| 2 | 1000001 | P00087842 | F | 0-17 | 10 | A |
| 3 | 1000001 | P00085442 | F | 0-17 | 10 | A |
| 4 | 1000002 | P00285442 | M | 55+ | 16 | C |
Por labels das colunas¶
df.loc[ : , ['Age', 'Gender', 'Purchase']]
| Age | Gender | Purchase | |
|---|---|---|---|
| 0 | 0-17 | F | 8370 |
| 1 | 0-17 | F | 15200 |
| 2 | 0-17 | F | 1422 |
| 3 | 0-17 | F | 1057 |
| 4 | 55+ | M | 7969 |
| ... | ... | ... | ... |
| 550063 | 51-55 | M | 368 |
| 550064 | 26-35 | F | 371 |
| 550065 | 26-35 | F | 137 |
| 550066 | 55+ | F | 365 |
| 550067 | 46-50 | F | 490 |
550068 rows × 3 columns
df = pd.read_csv('train.csv')
df.head()
| User_ID | Product_ID | Gender | Age | Occupation | City_Category | Stay_In_Current_City_Years | Marital_Status | Product_Category_1 | Product_Category_2 | Product_Category_3 | Purchase | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1000001 | P00069042 | F | 0-17 | 10 | A | 2 | 0 | 3 | NaN | NaN | 8370 |
| 1 | 1000001 | P00248942 | F | 0-17 | 10 | A | 2 | 0 | 1 | 6.0 | 14.0 | 15200 |
| 2 | 1000001 | P00087842 | F | 0-17 | 10 | A | 2 | 0 | 12 | NaN | NaN | 1422 |
| 3 | 1000001 | P00085442 | F | 0-17 | 10 | A | 2 | 0 | 12 | 14.0 | NaN | 1057 |
| 4 | 1000002 | P00285442 | M | 55+ | 16 | C | 4+ | 0 | 8 | NaN | NaN | 7969 |
df['Stay_In_Current_City_Years'].unique()
array(['2', '4+', '3', '1', '0'], dtype=object)
Stay_In_Current_City_Years_values = {'0': 0, '1': 1, '2':2, '3':3 ,'4+':4}
df['Stay_In_Current_City_Years'].replace(Stay_In_Current_City_Years_values, inplace=True)
df.head()
| User_ID | Product_ID | Gender | Age | Occupation | City_Category | Stay_In_Current_City_Years | Marital_Status | Product_Category_1 | Product_Category_2 | Product_Category_3 | Purchase | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1000001 | P00069042 | F | 0-17 | 10 | A | 2 | 0 | 3 | NaN | NaN | 8370 |
| 1 | 1000001 | P00248942 | F | 0-17 | 10 | A | 2 | 0 | 1 | 6.0 | 14.0 | 15200 |
| 2 | 1000001 | P00087842 | F | 0-17 | 10 | A | 2 | 0 | 12 | NaN | NaN | 1422 |
| 3 | 1000001 | P00085442 | F | 0-17 | 10 | A | 2 | 0 | 12 | 14.0 | NaN | 1057 |
| 4 | 1000002 | P00285442 | M | 55+ | 16 | C | 4 | 0 | 8 | NaN | NaN | 7969 |
df.head()
| User_ID | Product_ID | Gender | Age | Occupation | City_Category | Stay_In_Current_City_Years | Marital_Status | Product_Category_1 | Product_Category_2 | Product_Category_3 | Purchase | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1000001 | P00069042 | F | 0-17 | 10 | A | 2 | 0 | 3 | NaN | NaN | 8370 |
| 1 | 1000001 | P00248942 | F | 0-17 | 10 | A | 2 | 0 | 1 | 6.0 | 14.0 | 15200 |
| 2 | 1000001 | P00087842 | F | 0-17 | 10 | A | 2 | 0 | 12 | NaN | NaN | 1422 |
| 3 | 1000001 | P00085442 | F | 0-17 | 10 | A | 2 | 0 | 12 | 14.0 | NaN | 1057 |
| 4 | 1000002 | P00285442 | M | 55+ | 16 | C | 4 | 0 | 8 | NaN | NaN | 7969 |
print('min:', df.Purchase.min())
print('mediana:', df.Purchase.median())
print('moda:', df.Purchase.mode()[0])
print('mean:', df.Purchase.mean())
print('max:', df.Purchase.max())
min: 12 mediana: 8047.0 moda: 7011 mean: 9263.968712959126 max: 23961
import matplotlib.pyplot as plt
df.Purchase.plot(kind='hist', title='Distribuição Vendas',
figsize=(10,6))
mean = df.Purchase.mean()
moda = df.Purchase.mode()[0]
mediana = df.Purchase.median()
plt.axvline(mean,color='b', linestyle='--')
plt.axvline(moda,color='green', linestyle='--')
plt.axvline(mediana,color='red', linestyle='--')
plt.legend({'Média':mean,'Moda':moda,'Mediana':mediana})
<matplotlib.legend.Legend at 0x1f3bcbf94c0>
pd.cut(df.Purchase, bins = [0, 5000, 15000,25000], labels = ['Pouco', 'Médio','Satisfatório'])
0 Médio
1 Satisfatório
2 Pouco
3 Pouco
4 Médio
...
550063 Pouco
550064 Pouco
550065 Pouco
550066 Pouco
550067 Pouco
Name: Purchase, Length: 550068, dtype: category
Categories (3, object): ['Pouco' < 'Médio' < 'Satisfatório']
df.shape
df['Purchase_labels'] = pd.cut(df.Purchase, bins = [0, 5000, 15000,25000], labels = ['Pouco', 'Médio','Satisfatório'])
df.head()
| User_ID | Product_ID | Gender | Age | Occupation | City_Category | ... | Marital_Status | Product_Category_1 | Product_Category_2 | Product_Category_3 | Purchase | Purchase_labels | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1000001 | P00069042 | F | 0-17 | 10 | A | ... | 0 | 3 | NaN | NaN | 8370 | Médio |
| 1 | 1000001 | P00248942 | F | 0-17 | 10 | A | ... | 0 | 1 | 6.0 | 14.0 | 15200 | Satisfatório |
| 2 | 1000001 | P00087842 | F | 0-17 | 10 | A | ... | 0 | 12 | NaN | NaN | 1422 | Pouco |
| 3 | 1000001 | P00085442 | F | 0-17 | 10 | A | ... | 0 | 12 | 14.0 | NaN | 1057 | Pouco |
| 4 | 1000002 | P00285442 | M | 55+ | 16 | C | ... | 0 | 8 | NaN | NaN | 7969 | Médio |
5 rows × 13 columns
#!pip install pandas-profiling
import pandas_profiling
df = pd.read_csv('train.csv')
pandas_profiling.ProfileReport(df)
HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=25.0, style=ProgressStyle(descrip…
HBox(children=(FloatProgress(value=0.0, description='Generate report structure', max=1.0, style=ProgressStyle(…
HBox(children=(FloatProgress(value=0.0, description='Render HTML', max=1.0, style=ProgressStyle(description_wi…